# mrrc 0.7.6
#
# A Rust library for reading, writing, and manipulating MARC bibliographic records in ISO 2709 binary format.
# Documentation
"""
Record Boundary Scanner Tests

Tests the RecordBoundaryScanner Rust module for accurate MARC record
boundary detection using 0x1D (record terminator) delimiters. These tests verify:

- Correct identification of record boundaries
- Proper length calculation (including terminator)
- Batch limiting functionality
- Integration with existing MARC readers
- Performance characteristics for large buffers
"""

import pytest
from mrrc import MARCReader, RecordBoundaryScanner


@pytest.fixture
def simple_book_bytes():
    """Return the full binary contents of tests/data/simple_book.mrc."""
    with open("tests/data/simple_book.mrc", mode="rb") as handle:
        contents = handle.read()
    return contents


@pytest.fixture
def multi_records_bytes():
    """Return the full binary contents of tests/data/multi_records.mrc."""
    with open("tests/data/multi_records.mrc", mode="rb") as handle:
        contents = handle.read()
    return contents


class TestBoundaryScannerBasics:
    """Core scanner behaviour on small, hand-built buffers."""

    def test_scanner_creation(self):
        """A scanner can be constructed without arguments."""
        assert RecordBoundaryScanner() is not None

    def test_scan_single_record(self):
        """One terminated record produces exactly one boundary."""
        # Three payload bytes followed by the 0x1D record terminator.
        payload = b"\x01\x02\x03\x1d"
        found = RecordBoundaryScanner().scan(payload)

        assert len(found) == 1
        assert found[0] == (0, 4)

    def test_scan_multiple_records(self):
        """Three terminated records produce three (offset, length) pairs."""
        # Layout: [01 02 1D] [03 04 1D] [05 1D]
        payload = b"\x01\x02\x1d\x03\x04\x1d\x05\x1d"
        expected = [(0, 3), (3, 3), (6, 2)]

        found = RecordBoundaryScanner().scan(payload)

        assert len(found) == len(expected)
        for index, boundary in enumerate(expected):
            assert found[index] == boundary

    def test_scan_empty_buffer(self):
        """Scanning a zero-length buffer raises."""
        scanner = RecordBoundaryScanner()
        empty = bytes()
        with pytest.raises(Exception):  # MarcError
            scanner.scan(empty)

    def test_scan_no_terminators(self):
        """Scanning a buffer that contains no 0x1D byte raises."""
        scanner = RecordBoundaryScanner()
        payload = b"\x01\x02\x03\x04"  # no record terminator present
        with pytest.raises(Exception):  # MarcError
            scanner.scan(payload)


class TestBoundaryScannerRealData:
    """Scanner behaviour on the MARC fixture files."""

    def test_scan_simple_book(self, simple_book_bytes):
        """simple_book.mrc yields at least one in-bounds record from offset 0."""
        found = RecordBoundaryScanner().scan(simple_book_bytes)
        file_size = len(simple_book_bytes)

        # The fixture is expected to hold a single record; only a lower
        # bound is asserted here.
        assert len(found) >= 1, "Should find at least one record"

        first_offset = found[0][0]
        assert first_offset == 0, "First record should start at offset 0"

        # No boundary may reach past the end of the file.
        for start, size in found:
            assert start + size <= file_size, \
                "Record boundary exceeds file size"

    def test_scan_multi_records(self, multi_records_bytes):
        """multi_records.mrc yields several boundaries in ascending order."""
        found = RecordBoundaryScanner().scan(multi_records_bytes)

        assert len(found) > 1

        # Offsets must already be in non-decreasing order.
        starts = [start for start, _ in found]
        assert starts == sorted(starts)

    def test_boundary_reconstruction(self, simple_book_bytes):
        """Each boundary slices out a non-empty chunk ending in 0x1D."""
        found = RecordBoundaryScanner().scan(simple_book_bytes)

        for start, size in found:
            end = start + size
            chunk = simple_book_bytes[start:end]

            assert len(chunk) > 0, "Record should have content"
            assert chunk[-1] == 0x1D, "Record should end with 0x1D record terminator"
            assert end <= len(simple_book_bytes), \
                "Record should not exceed file boundaries"


class TestBoundaryScannerLimiting:
    """Behaviour of scan_limited, which caps the number of boundaries."""

    def test_scan_limited(self):
        """With a limit of 2 on three records, only the first two come back."""
        payload = b"\x01\x1d\x02\x1d\x03\x1d"  # three one-byte records
        found = RecordBoundaryScanner().scan_limited(payload, 2)

        assert len(found) == 2
        assert found[0] == (0, 2)
        assert found[1] == (2, 2)

    def test_scan_limited_exceeds_available(self):
        """A limit larger than the record count returns every record."""
        payload = b"\x01\x1d\x02\x1d"  # two one-byte records
        found = RecordBoundaryScanner().scan_limited(payload, 10)

        # Only two records exist, so the limit of 10 is never reached.
        assert len(found) == 2

    def test_scan_limited_batch_processing(self, multi_records_bytes):
        """scan_limited on real data returns between 1 and `limit` records."""
        scanner = RecordBoundaryScanner()

        # Unlimited scan establishes how many records the file holds.
        total_records = len(scanner.scan(multi_records_bytes))
        assert total_records > 0, "Should find at least one record"

        # Ask for roughly half of them, rounding up.
        batch_size = (total_records + 1) // 2
        limited = scanner.scan_limited(multi_records_bytes, batch_size)

        assert len(limited) <= batch_size, \
            f"Should return at most {batch_size} records, got {len(limited)}"
        assert len(limited) > 0, "Should return at least one record"


class TestBoundaryScannerCounting:
    """Behaviour of count_records."""

    def test_count_records(self):
        """Two terminators in the buffer give a count of 2."""
        payload = b"\x01\x1d\x02\x1d"  # two one-byte records
        assert RecordBoundaryScanner().count_records(payload) == 2

    def test_count_records_empty(self):
        """An empty buffer gives a count of 0."""
        total = RecordBoundaryScanner().count_records(b"")
        assert total == 0

    def test_count_records_no_terminators(self):
        """A buffer without any 0x1D byte gives a count of 0."""
        payload = b"\x01\x02\x03\x04"
        total = RecordBoundaryScanner().count_records(payload)
        assert total == 0

    def test_count_matches_scan(self, multi_records_bytes):
        """count_records equals the number of boundaries scan returns."""
        scanner = RecordBoundaryScanner()

        total = scanner.count_records(multi_records_bytes)
        found = scanner.scan(multi_records_bytes)

        assert total == len(found)


class TestBoundaryScannerPerformance:
    """Checks on a larger buffer, scanner reuse, and count/scan agreement."""

    def test_large_buffer_scan(self):
        """A buffer of 1000 two-byte records yields 1000 boundaries."""
        # Records alternate a 0x01 / 0x02 payload byte, each followed by
        # the 0x1D record terminator; 500 repetitions give 1000 records.
        record_pair = bytes([0x01, 0x1D, 0x02, 0x1D])
        payload = record_pair * 500

        found = RecordBoundaryScanner().scan(payload)

        assert len(found) == 1000
        # Spot-check both ends of the result.
        assert found[0] == (0, 2)
        assert found[999] == (1998, 2)

    def test_scanner_reuse(self):
        """A second scan on the same scanner is unaffected by the first."""
        scanner = RecordBoundaryScanner()

        # First pass: two records.
        first_pass = scanner.scan(b"\x01\x1d\x02\x1d")
        assert len(first_pass) == 2

        # Second pass: one record; nothing from the first pass may leak in.
        second_pass = scanner.scan(b"\x01\x1d")
        assert len(second_pass) == 1
        assert second_pass[0] == (0, 2)

    def test_count_vs_scan_performance(self, multi_records_bytes):
        """count_records agrees with scan on real data.

        No timing is measured here; only the two results are compared.
        """
        scanner = RecordBoundaryScanner()

        total = scanner.count_records(multi_records_bytes)
        found = scanner.scan(multi_records_bytes)

        assert total == len(found)


class TestBoundaryScannerIntegration:
    """Scanner output checked against MARCReader and for mutual disjointness."""

    def test_boundaries_enable_parallel_parsing(self, multi_records_bytes):
        """No two boundaries returned for multi_records.mrc overlap."""
        found = RecordBoundaryScanner().scan(multi_records_bytes)

        assert len(found) > 0, "Should find at least one record"

        # Compare every ordered pair of distinct boundaries.
        for i, (offset1, len1) in enumerate(found):
            end1 = offset1 + len1
            for j, (offset2, len2) in enumerate(found):
                if i == j:
                    continue
                end2 = offset2 + len2
                # Two half-open ranges intersect exactly when each one
                # starts before the other ends.
                overlapping = offset2 < end1 and offset1 < end2
                assert not overlapping, \
                    f"Records {i} and {j} overlap"

    def test_sequential_vs_boundary_parsing(self, multi_records_bytes):
        """The scanner finds at least as many records as MARCReader reads."""
        # Drain the sequential reader until it signals the end with None.
        reader = MARCReader(multi_records_bytes)
        sequential_records = []
        record = reader.read_record()
        while record is not None:
            sequential_records.append(record)
            record = reader.read_record()

        assert len(sequential_records) > 0, "Should find records via sequential parsing"

        # The boundary scan only looks for record delimiters.
        found = RecordBoundaryScanner().scan(multi_records_bytes)

        assert len(found) > 0, "Boundary scan should find record boundaries"
        # The scan may report more entries than the parser returns records
        # (it may find partial records), but never fewer.
        assert len(found) >= len(sequential_records), \
            "Boundary scan should find >= complete records"


class TestBoundaryScannerAcceptanceCriteria:
    """Test boundary scanner acceptance criteria."""

    def test_accepts_real_marc_data(self, simple_book_bytes, multi_records_bytes):
        """Criterion 1: Scanner accepts real MARC files."""
        scanner = RecordBoundaryScanner()

        # Both fixture files should be scannable with the same scanner.
        boundaries1 = scanner.scan(simple_book_bytes)
        assert len(boundaries1) > 0

        boundaries2 = scanner.scan(multi_records_bytes)
        assert len(boundaries2) > 0

    def test_produces_valid_boundaries(self, multi_records_bytes):
        """Criterion 2: Boundaries are valid offsets into buffer."""
        scanner = RecordBoundaryScanner()
        boundaries = scanner.scan(multi_records_bytes)

        for offset, length in boundaries:
            # Offset should be within bounds
            assert 0 <= offset < len(multi_records_bytes)
            # A record includes its terminator, so it can never be empty.
            # Without this guard the terminator check below would inspect
            # a byte that lies outside the record.
            assert length > 0
            # Length should fit
            assert offset + length <= len(multi_records_bytes)
            # Should end with record terminator (0x1D)
            assert multi_records_bytes[offset + length - 1] == 0x1D

    def test_enables_parallel_parsing_readiness(self, multi_records_bytes):
        """Criterion 3: Output suitable for parallel processing."""
        scanner = RecordBoundaryScanner()
        boundaries = scanner.scan(multi_records_bytes)

        # Boundaries should be independent (non-overlapping): no byte
        # position may be claimed by more than one record.
        covered_positions = set()
        for offset, length in boundaries:
            for position in range(offset, offset + length):
                assert position not in covered_positions, "Overlapping boundaries"
                covered_positions.add(position)

        # Every byte of every record must be counted exactly once, so the
        # number of distinct covered positions equals the summed lengths.
        # Note: May not cover entire file if last record is incomplete
        expected_covered = sum(length for _, length in boundaries)
        assert len(covered_positions) == expected_covered, "Should not have duplicates"

        # Verify boundaries don't exceed file size
        for offset, length in boundaries:
            assert offset + length <= len(multi_records_bytes), \
                f"Boundary exceeds file: {offset} + {length} > {len(multi_records_bytes)}"