import struct
import os
import sys
from pathlib import Path
# Control bytes used when serializing records in this file.
# FIELD_TERMINATOR closes every field payload and also the directory.
FIELD_TERMINATOR = b'\x1e'
# SUBFIELD_DELIMITER is written immediately before each subfield code.
SUBFIELD_DELIMITER = b'\x1f'
# RECORD_TERMINATOR is the last byte of every serialized record.
RECORD_TERMINATOR = b'\x1d'
def build_leader(record_length, base_address, record_type='a', bib_level='m'):
    """Return the fixed 24-byte leader for a MARC record.

    Byte layout produced:
        00-04  record_length, zero-padded to five digits
        05     'n'
        06     record_type
        07     bib_level
        08     ' '
        09     'a'
        10-11  '22'
        12-16  base_address, zero-padded to five digits
        17-19  three spaces
        20-23  '4500'

    Args:
        record_length: total record size in bytes, including the leader
            and the record terminator.
        base_address: offset of the first data byte (leader + directory).
        record_type: single character stored at byte 06.
        bib_level: single character stored at byte 07.

    Returns:
        The leader as a ``bytes`` object of length 24.

    Raises:
        ValueError: if ``record_length`` or ``base_address`` does not fit
            in five decimal digits (0..99999).
    """
    # Both numeric slots are exactly five digits wide.  A larger (or
    # negative) value would still format, but would shift every following
    # byte and yield a leader that is no longer 24 bytes long.
    for label, value in (('record_length', record_length),
                         ('base_address', base_address)):
        if not 0 <= value <= 99999:
            raise ValueError(
                f'{label} must be between 0 and 99999, got {value!r}'
            )

    return b''.join((
        f'{record_length:05d}'.encode('ascii'),
        b'n',
        # ord() keeps the single-character requirement of the parameters.
        bytes([ord(record_type), ord(bib_level)]),
        b' ',
        b'a',
        b'22',
        f'{base_address:05d}'.encode('ascii'),
        b' ' * 3,
        b'4500',
    ))
def build_directory_and_data(fields_data):
    """Lay out the field payloads and build the matching directory.

    Fields are emitted in ascending tag order.  For every field one
    12-byte ASCII directory entry is written: the 3-character tag, the
    field length as four digits, and the field's starting offset inside
    the data area as five digits.  The directory is closed with
    ``FIELD_TERMINATOR``.

    Args:
        fields_data: mapping of tag string to the complete field bytes.
            The bytes are copied verbatim and their length is what gets
            recorded in the directory.

    Returns:
        A ``(data_area, directory)`` tuple of ``bytes`` objects.

    Raises:
        ValueError: if a tag is not exactly three bytes, a field is longer
            than 9999 bytes, or a field's starting offset exceeds 99999.
            Any of these would overflow its fixed-width column and
            misalign every later directory entry.
    """
    entries = []
    payloads = []
    offset = 0
    for tag in sorted(fields_data):
        payload = fields_data[tag]
        size = len(payload)
        tag_bytes = tag.encode('ascii')
        if len(tag_bytes) != 3:
            raise ValueError(f'tag must be exactly 3 characters, got {tag!r}')
        if size > 9999:
            raise ValueError(
                f'field {tag!r} is {size} bytes; the directory allows at most 9999'
            )
        if offset > 99999:
            raise ValueError(
                f'field {tag!r} starts at offset {offset}; the directory allows at most 99999'
            )
        entries.append(tag_bytes + f'{size:04d}{offset:05d}'.encode('ascii'))
        payloads.append(payload)
        offset += size

    entries.append(FIELD_TERMINATOR)
    # Join once at the end instead of growing two bytes objects with +=.
    return b''.join(payloads), b''.join(entries)
def build_marc_record(fields_data):
    """Serialize a tag -> field-bytes mapping into one complete record.

    The result is leader + directory + data area + RECORD_TERMINATOR.
    The leader is built with build_leader's default record type and
    bibliographic level.

    Args:
        fields_data: mapping passed straight to build_directory_and_data.

    Returns:
        The full record as bytes.
    """
    payload, index = build_directory_and_data(fields_data)

    # The data area begins right after the 24-byte leader and the directory.
    data_start = 24 + len(index)
    # The trailing +1 accounts for the record terminator byte.
    total_size = data_start + len(payload) + 1

    return b''.join((
        build_leader(total_size, data_start),
        index,
        payload,
        RECORD_TERMINATOR,
    ))
def create_book_record(record_num):
    """Build a synthetic book record whose content is derived from record_num.

    Always present: 008, 100, 245, 260, 300 and at least one 65x field.
    Conditional fields:
        500  only when record_num is a multiple of 5
        650..653  the first (record_num % 4) + 1 subject terms, one tag each
        856  only when record_num is a multiple of 10

    Args:
        record_num: integer that seeds the variable parts of the record.

    Returns:
        The serialized record produced by build_marc_record.
    """
    def make_field(indicators, *subfields):
        # indicators, then delimiter + code + value per subfield, then terminator
        body = b''.join(
            SUBFIELD_DELIMITER + code + value for code, value in subfields
        )
        return indicators + body + FIELD_TERMINATOR

    no_indicators = b' ' * 2

    # NOTE(review): this 008 value is shorter than the 40 characters MARC 21
    # defines for the field; confirm whether padding was lost somewhere.
    record_fields = {
        '008': b'200101s2020 xxu||||||||||||||||eng||' + FIELD_TERMINATOR,
        '100': make_field(
            b'1 ',
            (b'a', f'Author, Test {record_num % 1000}'.encode('utf-8')),
        ),
        '245': make_field(
            b'10',
            (b'a', f'Test Book Number {record_num}'.encode('utf-8')),
            (b'c', b'Test Author'),
        ),
        '260': make_field(
            b' 1',
            (b'a', f'Test City : Test Publishers, {2000 + (record_num % 25)}'.encode('utf-8')),
        ),
        '300': make_field(
            no_indicators,
            (b'a', str(100 + record_num % 400).encode('utf-8') + b' pages'),
        ),
    }

    if record_num % 5 == 0:
        record_fields['500'] = make_field(
            no_indicators, (b'a', b'A note about this record.')
        )

    # Each subject gets its own tag (650, 651, ...) because the mapping
    # holds a single field per tag.
    subject_terms = ['Fiction', 'Literature', 'Novels', 'Contemporary']
    how_many = (record_num % 4) + 1
    for position in range(how_many):
        record_fields[f'65{position}'] = make_field(
            b' 0', (b'a', subject_terms[position].encode('utf-8'))
        )

    if record_num % 10 == 0:
        record_fields['856'] = make_field(
            b'40',
            (b'u', b'https://example.com/book' + str(record_num).encode('utf-8')),
        )

    return build_marc_record(record_fields)
def create_authority_record(record_num):
    """Build a synthetic authority-style record derived from record_num.

    Contains an 008 field and a 150 field holding "Test Term <record_num>".
    A 450 field ("Variant Term") is added when record_num is a multiple of 3.

    NOTE(review): the record is serialized through build_marc_record, which
    uses build_leader's default type/level characters ('a'/'m'); confirm that
    is intended for authority data.

    Args:
        record_num: integer embedded in the 150 term.

    Returns:
        The serialized record produced by build_marc_record.
    """
    blank_indicators = b' ' * 2
    heading = f'Test Term {record_num}'.encode('utf-8')

    # NOTE(review): this 008 value is shorter than the 40 characters MARC 21
    # defines for the field; confirm whether padding was lost somewhere.
    record_fields = {
        '008': b''.join([b'200101n azznnaabn |a aaa ', FIELD_TERMINATOR]),
        '150': b''.join([
            blank_indicators, SUBFIELD_DELIMITER, b'a', heading, FIELD_TERMINATOR,
        ]),
    }

    has_variant = record_num % 3 == 0
    if has_variant:
        record_fields['450'] = b''.join([
            blank_indicators, SUBFIELD_DELIMITER, b'a', b'Variant Term', FIELD_TERMINATOR,
        ])

    return build_marc_record(record_fields)
def generate_fixture(output_path, num_records, progress=True):
    """Write num_records synthetic records to output_path.

    Every fifth record (index 0, 5, 10, ...) comes from
    create_authority_record; all others from create_book_record.  Parent
    directories are created when missing and an existing file is
    overwritten.

    Args:
        output_path: destination file, as a string or Path.
        num_records: how many records to write.
        progress: when true, report to stderr after every 10,000 records.

    Returns:
        None.  A one-line summary with the file size is printed to stdout.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open('wb') as sink:
        for index in range(num_records):
            make_record = (
                create_authority_record if index % 5 == 0 else create_book_record
            )
            sink.write(make_record(index))

            written = index + 1
            if progress and written % 10000 == 0:
                print(f" {written:,} records written...", file=sys.stderr)

    # Measure only after the file is closed so buffered bytes are counted.
    size_mb = target.stat().st_size / (1024 * 1024)
    print(f"Created {target} with {num_records:,} records ({size_mb:.2f} MB)")
def main():
    """Generate the benchmark fixtures and print a summary of the directory.

    Writes a 1,000-record and a 10,000-record file under
    tests/data/fixtures, then lists every *.mrc file found there with its
    record count (number of RECORD_TERMINATOR bytes) and size.
    """
    fixtures_dir = Path('tests/data/fixtures')
    fixtures_dir.mkdir(parents=True, exist_ok=True)

    print("Generating MARC benchmark fixtures...")
    print()

    planned = (
        ("Small fixture (1k records):", '1k_records.mrc', 1000),
        ("Medium fixture (10k records):", '10k_records.mrc', 10000),
    )
    for heading, filename, how_many in planned:
        print(heading)
        generate_fixture(fixtures_dir / filename, how_many)
        print()

    print("✓ All fixtures generated successfully!")
    print()
    print("Available fixtures:")
    for fixture in sorted(fixtures_dir.glob('*.mrc')):
        size_mb = fixture.stat().st_size / (1024 * 1024)
        # Each record ends with exactly one terminator byte, so counting
        # terminators gives the number of records in the file.
        count = fixture.read_bytes().count(RECORD_TERMINATOR)
        print(f" {fixture.name}: {count:,} records ({size_mb:.2f} MB)")
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()